import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from plotly.offline import init_notebook_mode, iplot
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly import tools
# Suppress every warning for the rest of the session.
warnings.simplefilter('ignore')
# Let pandas show up to 100 rows before it truncates a displayed frame.
pd.options.display.max_rows = 100
# Enable plotly's offline rendering inside the notebook; connected=True makes
# the notebook load plotly.js from the CDN instead of embedding it.
init_notebook_mode(connected=True)
%matplotlib inline
# Load the cleaned train/test splits and print a compact summary of each.
train = pd.read_csv('./data/train_clean.csv')
test = pd.read_csv('./data/test_clean.csv')

# DataFrame.info() writes its report to stdout itself and returns None, so it
# must be called directly: wrapping it in print() appends a stray "None".
print('Train:')
train.info(verbose=False)
print()
print('Test:')
test.info(verbose=False)
# Class balance of the target column: absolute counts, the share of each
# class among all rows, and the class-0 to class-1 ratio.
n_samples = len(train)
target1 = train['target'].sum()
target0 = (1 - train['target']).sum()

share0 = np.round(target0 / n_samples, 4)
share1 = np.round(target1 / n_samples, 4)
ratio01 = np.round(target0 / target1, 4)

print('Target 0:\t', target0, '\t', share0)
print('Target 1:\t', target1, '\t', share1)
print('0/1 Ratio:\t', ratio01)
# Bar chart of the number of rows in each target class.
data = [
    go.Bar(x=[x_label], y=[count], name=trace_name)
    for x_label, count, trace_name in (
        ('status 0', target0, 'Status 0'),
        ('status 1', target1, 'Status 1'),
    )
]

# Fixed-size figure with a horizontal legend placed below the plot area.
margin = go.layout.Margin(l=50, r=50, b=30, t=40, pad=4)
legend = {'orientation': 'h', 'xanchor': 'auto', 'y': -0.2}
layout = go.Layout(
    title='Loan Status Count Plot',
    xaxis={'title': 'Loan Status'},
    yaxis={'title': 'Count'},
    autosize=False,
    width=700,
    height=400,
    margin=margin,
    legend=legend,
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)
# Column names treated as categorical features.
cat_features = [
    'term',
    'home_ownership',
    'verification_status',
    'purpose',
    'title',
    'addr_state',
    'initial_list_status',
    'application_type',
    'grade',
    'sub_grade',
]

# Column names treated as numerical features.
num_features = [
    'loan_amnt',
    'int_rate',
    'installment_ratio',
    'emp_length',
    'annual_inc',
    'dti',
    'delinq_2yrs',
    'inq_last_6mths',
    'open_acc',
    'pub_rec',
    'revol_bal',
    'revol_util',
    'total_acc',
    'collections_12_mths_ex_med',
    'acc_now_delinq',
    'tot_coll_amt',
    'tot_cur_bal',
    'total_rev_hi_lim',
    'acc_open_past_24mths',
    'avg_cur_bal',
    'bc_open_to_buy',
    'bc_util',
    'chargeoff_within_12_mths',
    'delinq_amnt',
    'mo_sin_old_il_acct',
    'mo_sin_old_rev_tl_op',
    'mo_sin_rcnt_rev_tl_op',
    'mo_sin_rcnt_tl',
    'mort_acc',
    'mths_since_recent_bc',
    'mths_since_recent_inq',
    'num_accts_ever_120_pd',
    'num_actv_bc_tl',
    'num_actv_rev_tl',
    'num_bc_sats',
    'num_bc_tl',
    'num_il_tl',
    'num_op_rev_tl',
    'num_rev_accts',
    'num_rev_tl_bal_gt_0',
    'num_sats',
    'num_tl_120dpd_2m',
    'num_tl_30dpd',
    'num_tl_90g_dpd_24m',
    'num_tl_op_past_12m',
    'pct_tl_nvr_dlq',
    'percent_bc_gt_75',
    'pub_rec_bankruptcies',
    'tax_liens',
    'tot_hi_cred_lim',
    'total_bal_ex_mort',
    'total_bc_limit',
    'total_il_high_credit_limit',
    'credit_length',
]

# Full feature list: categorical columns first, then numerical ones.
features = cat_features + num_features

# Report how many features fall into each group.
for _label, _names in (
    ('Categorical feature:\t', cat_features),
    ('Numerical feature:\t', num_features),
    ('Total feature:\t\t', features),
):
    print(_label, len(_names))
def numerical_plot(data, feature, width=800, height=400, bins=50):
    """Build a two-panel plotly figure describing one numerical feature.

    Left panel: a distribution plot of ``feature`` drawn separately for the
    rows with ``target == 0`` and the rows with ``target == 1``.

    Right panel: the mean of ``target`` for every distinct value of
    ``feature``. When the feature has at most ``bins`` distinct values this
    is one bar per value with an error bar; otherwise it is a line with a
    shaded band of +/- one standard error of the mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the ``feature`` column and a ``target`` column. Only
        rows whose target equals 0 or 1 contribute to the left panel.
    feature : str
        Name of the column to plot; also used (capitalized) in axis titles.
    width, height : int
        Size of the resulting figure.
    bins : int
        Number of histogram bins for features with many distinct values, and
        the distinct-value threshold that selects bar vs. line rendering in
        the right panel.

    Returns
    -------
    The figure created by ``tools.make_subplots`` with all traces appended
    and the layout configured.
    """
    titles = ('Histogram Plot', 'Default Rate vs. ' + feature.capitalize())
    fig = tools.make_subplots(rows=1, cols=2, print_grid=False, subplot_titles=titles)

    # ---- panel 1: distribution of the feature for each target value ----
    x0 = data[data['target'] == 0][feature]
    x1 = data[data['target'] == 1][feature]

    # Overall value range across both groups.
    start = min(x0.min(), x1.min())
    end = max(x0.max(), x1.max())

    # Use unit-width bins when there are few distinct values; otherwise
    # split the value range into `bins` equal-width bins.
    n_unique = len(data[feature].unique())
    if n_unique <= min(end - start + 1, bins):
        bin_size = 1
    else:
        bin_size = (end - start) / bins

    dist_fig = ff.create_distplot(hist_data=[x0, x1],
                                  group_labels=['Status 0', 'Status 1'],
                                  bin_size=bin_size, show_rug=False)
    # Copy every trace of the distribution plot into the left subplot.
    for dist_trace in dist_fig['data']:
        fig.append_trace(dist_trace, 1, 1)

    # ---- panel 2: mean target per distinct feature value ----
    # Always aggregate the frame that was passed in, never a module-level one.
    target_by_value = data.groupby(feature)['target']
    if n_unique <= bins:
        # Few distinct values: one bar per value with an error bar.
        means = target_by_value.mean()
        stds = target_by_value.std()
        for name, mean, std in zip(means.index, means, stds):
            # NOTE(review): the first argument of norm.interval is the
            # confidence level, so 0.05 yields the central 5% interval of a
            # normal with the group's std -- possibly 0.95 and/or the
            # standard error was intended. Kept as-is; confirm with author.
            low, _ = stats.norm.interval(0.05, loc=mean, scale=std)
            er = mean - low
            trace = go.Bar(x=[name], y=[mean], error_y=dict(array=[er], visible=True),
                           name=name, xaxis='x2')
            fig.append_trace(trace, 1, 2)
    else:
        # Many distinct values: mean line with a +/- one-SEM band.
        mean = target_by_value.mean()
        # Groups with a single row have an undefined SEM; draw no band there.
        sem = target_by_value.sem().fillna(value=0)
        index = mean.index
        lower = go.Scatter(x=index, y=mean - sem, mode='lines',
                           marker=dict(color="#444"), line=dict(width=0),
                           showlegend=False)
        trace = go.Scatter(name='Default Rate', x=index, y=mean,
                           line=dict(color='rgb(31, 119, 180)', width=1),
                           fillcolor='rgba(68, 68, 68, 0.3)', mode='lines')
        upper = go.Scatter(x=index, y=mean + sem, mode='lines',
                           marker=dict(color="#444"), line=dict(width=0),
                           fill='tonexty', fillcolor='rgba(68, 68, 68, 0.3)',
                           showlegend=False)
        # Order matters: `upper` fills down to the trace added just before
        # it (`lower`); the mean line goes last so it is drawn on top.
        fig.append_trace(lower, 1, 2)
        fig.append_trace(upper, 1, 2)
        fig.append_trace(trace, 1, 2)

    # ---- layout ----
    legend = dict(orientation='h', xanchor='auto', y=-0.2)
    margin = go.layout.Margin(l=50, r=50, b=50, t=40, pad=4)
    fig['layout'].update(xaxis=dict(domain=[0, 0.47]), xaxis2=dict(domain=[0.53, 1]),
                         yaxis2=dict(anchor='x2'), width=width, height=height,
                         margin=margin, legend=legend)
    fig['layout']['xaxis1'].update(title=feature.capitalize())
    fig['layout']['yaxis1'].update(title='Probability Density')
    fig['layout']['xaxis2'].update(title=feature.capitalize())
    fig['layout']['yaxis2'].update(title='Default Rate')
    return fig
# Draw the two-panel numerical plot for each of these features in turn:
# loan_amnt, int_rate, installment_ratio.
for feature in ('loan_amnt', 'int_rate', 'installment_ratio'):
    fig = numerical_plot(data=train, feature=feature, width=1000, height=450, bins=50)
    iplot(fig)